# read in the data
nndb_flat <- read.csv("nndb_flat.csv")
# filter the data to contain only food groups with limited variables
nndb <- nndb_flat %>%
filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>%
select("Energy_kcal" : "Zinc_mg")
# examine the correlation among the variables
ggcorr(nndb)
According to this correlation plot, high correlations can be observed among Folate_mcg, Thiamin_mg, Niacin_mg, and Riboflavin_mg. Other high positive correlations include VitA_mcg and Manganese_mg, Protein_g and Zinc_mg etc.
# scale the data
nndb_scaled <- scale(nndb)
# perform PCA on the data
pca_nndb <- prcomp(nndb_scaled,
center = FALSE,
scale. = FALSE)
summary(pca_nndb)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3024 1.8455 1.6577 1.6205 1.41770 1.03167 0.9922
## Proportion of Variance 0.2305 0.1481 0.1195 0.1142 0.08739 0.04628 0.0428
## Cumulative Proportion 0.2305 0.3785 0.4980 0.6122 0.69960 0.74587 0.7887
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.90473 0.85456 0.8279 0.74550 0.71398 0.61241 0.58307
## Proportion of Variance 0.03559 0.03175 0.0298 0.02416 0.02216 0.01631 0.01478
## Cumulative Proportion 0.82426 0.85602 0.8858 0.90998 0.93214 0.94845 0.96323
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.46385 0.42317 0.38016 0.31599 0.27229 0.23596 0.22717
## Proportion of Variance 0.00935 0.00779 0.00628 0.00434 0.00322 0.00242 0.00224
## Cumulative Proportion 0.97258 0.98037 0.98665 0.99099 0.99422 0.99664 0.99888
## PC22 PC23
## Standard deviation 0.1440 0.07045
## Proportion of Variance 0.0009 0.00022
## Cumulative Proportion 0.9998 1.00000
# figure out how many components we need
var_explained_df <- data.frame(PC = 1:23,
var_explained = summary(pca_nndb)$importance[3,])
head(var_explained_df)
## PC var_explained
## PC1 1 0.23048
## PC2 2 0.37855
## PC3 3 0.49803
## PC4 4 0.61221
## PC5 5 0.69960
## PC6 6 0.74587
# draw a scree plot to show the cumulative proportion of the variance explained by each PC
var_explained_df %>%
ggplot(aes(x = PC,
y = var_explained,
group = 1)) +
geom_point() +
geom_line() +
labs(title="Scree plot: PCA",
y = 'Cumulative proportion of variance explained',
x = '')
# make 3 separate plots for the loadings of the first 3 PCs for all of the variables
# plot 1
pca_nndb_loadings1 <- as.data.frame(pca_nndb$rotation) %>%
select(PC1) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC1) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings1,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC1") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# plot 2
pca_nndb_loadings2 <- as.data.frame(pca_nndb$rotation) %>%
select(PC2) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC2) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings2,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC2") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# plot 3
pca_nndb_loadings3 <- as.data.frame(pca_nndb$rotation) %>%
select(PC3) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC3) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings3,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC1") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# calculate pca scores
nndb2 <- nndb_flat %>%
filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>%
select("Energy_kcal" : "Zinc_mg", FoodGroup)
pca_scores <- as.data.frame(pca_nndb$x)
pca_scores <- pca_scores %>%
mutate(Foodgroup = nndb2$FoodGroup)
head(pca_scores)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 -1.5612574 0.1937294 1.1700818 -0.7998864 0.6148276 -0.5457873 -0.2069789
## 2 -1.3007821 0.8347198 0.8815545 -0.6019031 1.3214529 0.2214579 -0.6822156
## 3 -1.3432660 0.7163900 0.7840115 -0.6382599 1.3465715 0.1562026 -0.6526562
## 4 -0.9146238 0.3311983 0.5176899 -0.3154010 0.6263561 -0.7459331 0.2627309
## 5 -1.1155612 0.1624408 0.6605511 -0.4675455 0.5507415 -0.6857603 0.2328749
## 6 -1.4313712 0.9494195 0.6796587 -0.3914938 1.2217494 -0.7864057 -0.1050359
## PC8 PC9 PC10 PC11 PC12 PC13
## 1 0.1524726 -0.09007289 -0.17746633 -0.006238066 0.26035911 -0.13734436
## 2 -0.2738827 0.41480350 -1.15353346 -0.834741903 0.30889039 0.55405858
## 3 -0.3294967 0.41116129 -1.12053038 -0.750164431 0.43450453 0.51107986
## 4 -0.2598225 0.13668647 0.03091109 0.085235135 0.73926198 0.81665649
## 5 -0.3205132 0.07326368 -0.05649271 0.383306591 0.97061799 0.66661165
## 6 0.1061583 0.01578496 -0.05514659 0.526141035 0.06173713 -0.07221577
## PC14 PC15 PC16 PC17 PC18 PC19
## 1 0.1447197 -0.01319467 0.25343790 -0.05399305 -0.063357584 0.14331689
## 2 0.2175621 -0.15177657 0.13224277 -0.03502885 0.007560999 0.03636869
## 3 0.1581587 -0.17852753 0.16240197 -0.03885139 0.028064327 0.04698692
## 4 -0.5076588 0.01783758 -0.22142992 -0.01680315 -0.060215737 -0.04360432
## 5 -0.3400429 -0.15117768 -0.06695674 -0.02667881 -0.108876946 0.01367921
## 6 0.3616194 -0.11685329 0.19543874 0.08548548 -0.074980182 -0.11627266
## PC20 PC21 PC22 PC23
## 1 -0.03535393 0.02027412 -0.02599203 0.0095088000
## 2 0.14105445 0.03600869 -0.24366859 0.0039909754
## 3 0.02773533 -0.00322706 -0.15091874 -0.0007238741
## 4 -0.08555712 0.04233724 0.21186150 -0.0098150379
## 5 -0.09059414 0.01726881 0.21349082 -0.0117930981
## 6 0.10389198 0.14936890 -0.05919124 -0.0013144711
## Foodgroup
## 1 Vegetables and Vegetable Products
## 2 Vegetables and Vegetable Products
## 3 Vegetables and Vegetable Products
## 4 Vegetables and Vegetable Products
## 5 Vegetables and Vegetable Products
## 6 Vegetables and Vegetable Products
# PC1 versus PC2
plot1 <- ggplot(pca_scores,
aes(x = PC1,
y = PC2,
col = Foodgroup)) +
geom_point() +
labs(title = "PC1 versus PC2")
ggplotly(plot1)
# PC1 versus PC3
plot2 <- ggplot(pca_scores,
aes(x = PC1,
y = PC3,
col = Foodgroup)) +
geom_point() +
labs(title = "PC1 versus PC3")
ggplotly(plot2)
# PC2 versus PC3
plot3 <- ggplot(pca_scores,
aes(x = PC2,
y = PC3,
col = Foodgroup)) +
geom_point() +
labs(title = "PC2 versus PC3")
ggplotly(plot3)
The Vegetables and Vegetable Products is the outlier.
nndb <- nndb_flat %>%
filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>%
select("Energy_kcal" : "Zinc_mg") %>%
slice(-c(2108))
# scale the data
nndb_scaled <- scale(nndb)
# perform PCA on the data
pca_nndb <- prcomp(nndb_scaled,
center = FALSE,
scale. = FALSE)
summary(pca_nndb)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4047 1.9174 1.6360 1.5452 1.05222 1.02786 0.92913
## Proportion of Variance 0.2514 0.1598 0.1164 0.1038 0.04814 0.04593 0.03753
## Cumulative Proportion 0.2514 0.4113 0.5276 0.6314 0.67959 0.72552 0.76306
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.91925 0.8348 0.78281 0.76116 0.72114 0.70611 0.59976
## Proportion of Variance 0.03674 0.0303 0.02664 0.02519 0.02261 0.02168 0.01564
## Cumulative Proportion 0.79980 0.8301 0.85674 0.88193 0.90454 0.92621 0.94185
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.58556 0.51769 0.43258 0.42121 0.35832 0.31552 0.27864
## Proportion of Variance 0.01491 0.01165 0.00814 0.00771 0.00558 0.00433 0.00338
## Cumulative Proportion 0.95676 0.96841 0.97655 0.98426 0.98985 0.99417 0.99755
## PC22 PC23
## Standard deviation 0.22669 0.07037
## Proportion of Variance 0.00223 0.00022
## Cumulative Proportion 0.99978 1.00000
# figure out how many components we need
var_explained_df <- data.frame(PC = 1:23,
var_explained = summary(pca_nndb)$importance[3,])
head(var_explained_df)
## PC var_explained
## PC1 1 0.25142
## PC2 2 0.41127
## PC3 3 0.52764
## PC4 4 0.63145
## PC5 5 0.67959
## PC6 6 0.72552
# draw a scree plot to show the cumulative proportion of the variance explained by each PC
var_explained_df %>%
ggplot(aes(x = PC,
y = var_explained,
group = 1)) +
geom_point() +
geom_line() +
labs(title="Scree plot: PCA",
y = 'Cumulative proportion of variance explained',
x = '')
# make 3 separate plots for the loadings of the first 3 PCs for all of the variables
# plot 1
pca_nndb_loadings1 <- as.data.frame(pca_nndb$rotation) %>%
select(PC1) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC1) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings1,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC1") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# plot 2
pca_nndb_loadings2 <- as.data.frame(pca_nndb$rotation) %>%
select(PC2) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC2) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings2,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC2") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# plot 3
pca_nndb_loadings3 <- as.data.frame(pca_nndb$rotation) %>%
select(PC3) %>%
mutate(variable = rownames(pca_nndb$rotation)) %>%
rename(loadings = PC3) %>%
arrange(desc(abs(loadings)))
ggplot(pca_nndb_loadings3,
aes(x = reorder(variable,
abs(loadings)),
y = loadings)) +
geom_bar(stat = 'identity') +
labs(x = "Variable",
y = "Loadings",
title = "The loadings for PC3") +
theme(axis.text.x = element_text(angle = 45,
hjust = 1))
# calculate pca scores
nndb2 <- nndb_flat %>%
filter(FoodGroup %in% c("Vegetables and Vegetable Products", "Beef Products", "Sweets")) %>%
select("Energy_kcal" : "Zinc_mg", FoodGroup) %>%
slice(-c(2108))
pca_scores <- as.data.frame(pca_nndb$x)
pca_scores <- pca_scores %>%
mutate(Foodgroup = nndb2$FoodGroup)
head(pca_scores)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 -1.7027718 0.2607578 1.2860692 -0.8668904 0.4841448 0.2741745 -0.02544208
## 2 -1.5052871 1.2300725 1.0861992 -1.4691289 0.5803701 -0.6902443 0.55954856
## 3 -1.5439091 0.9445542 1.0367858 -1.2475228 0.6285126 -0.7195206 0.32270083
## 4 -0.8775346 0.5019583 0.4594099 -0.7565559 0.1531686 0.6975467 -0.29997996
## 5 -1.1404633 0.2379715 0.6655091 -0.6574418 0.1887129 0.6061557 -0.25116044
## 6 -1.5987481 1.2533865 0.7337843 -1.3253662 0.8309932 0.3283124 -0.05066289
## PC8 PC9 PC10 PC11 PC12 PC13
## 1 -0.27230017 -0.1341082 -0.05507301 -0.1025362 0.1932623 0.09842453
## 2 -0.11789083 -1.3603008 -0.35910732 -0.5405831 -0.4009605 -0.96196729
## 3 0.13514244 -1.2581021 -0.35096053 -0.3340014 0.0610447 -0.76157568
## 4 0.25658976 -0.0177702 -0.23923311 -0.6799443 0.5433446 0.94847310
## 5 0.30133009 -0.1037168 0.08390061 -0.6953898 0.7044298 0.99451103
## 6 -0.05182628 -0.1072654 0.65232931 -0.1994536 -0.4172233 -0.04412458
## PC14 PC15 PC16 PC17 PC18 PC19
## 1 -0.2011960 0.02647699 0.17828992 -0.14199021 0.26892752 0.05198769
## 2 0.3356937 -0.30302057 0.21382488 -0.10268721 0.06832141 0.13057971
## 3 0.4687941 -0.23202731 0.36386401 -0.12280256 0.06295249 0.07041284
## 4 0.8219214 -0.19841998 0.05424900 0.19168445 -0.13419691 -0.10533007
## 5 0.6173174 -0.21761361 0.34030427 0.08109275 -0.04898600 -0.09887924
## 6 -0.2725223 0.09566712 0.05093138 -0.23563155 0.04788329 0.01674507
## PC20 PC21 PC22 PC23
## 1 -0.02388626 -0.07661584 0.045962709 -0.008914489
## 2 0.06830207 -0.21587178 0.057060265 -0.005327082
## 3 0.05518188 -0.17597528 0.041799704 -0.002353218
## 4 -0.06851258 0.24429998 0.001351966 0.012080095
## 5 -0.11980030 0.23091714 -0.013138326 0.013051678
## 6 -0.02876194 0.11643664 0.100532203 0.002740541
## Foodgroup
## 1 Vegetables and Vegetable Products
## 2 Vegetables and Vegetable Products
## 3 Vegetables and Vegetable Products
## 4 Vegetables and Vegetable Products
## 5 Vegetables and Vegetable Products
## 6 Vegetables and Vegetable Products
# PC1 versus PC2
plot1 <- ggplot(pca_scores,
aes(x = PC1,
y = PC2,
col = Foodgroup)) +
geom_point() +
labs(title = "PC1 versus PC2")
ggplotly(plot1)
# PC1 versus PC3
plot2 <- ggplot(pca_scores,
aes(x = PC1,
y = PC3,
col = Foodgroup)) +
geom_point() +
labs(title = "PC1 versus PC3")
ggplotly(plot2)
# PC2 versus PC3
plot3 <- ggplot(pca_scores,
aes(x = PC2,
y = PC3,
col = Foodgroup)) +
geom_point() +
labs(title = "PC2 versus PC3")
ggplotly(plot3)
Comments: After removing the outlier, the loadings for PC2 changed the most compared to PC1 and PC3. This is because the dataset changes the most variance towards the direction of PC2 after removing the outlier.
Description: According to the plots of the scores, the three food groups can be distinguished better after removing the outlier. Loadings of the PCs represent the weight of each variable on the corresponding directions. Since removing the outlier changes the loadings for PCs, it is reasonable to see the plots of the scores change as well.